Take-home Exercise 2

Explore different perspectives and approaches to create more truthful and enlightening data visualisation

Huan Li https://linkedin.com/in/huan-li-ab7498124/ (SMU, SCIS, Master of IT in Business)https://scis.smu.edu.sg/master-it-business/about-mitb-main
2022-05-05

1. Overview

The main purpose of this assignment is to evaluate the take-home exercise 1 from the perspective of clarity and aesthetics, and to refine the diagram based on the principles we learned in the previous two lessons. The dataset of this project is VAST Challenge 2022. The operation was carried out on Rstudio and main packages used are tidyverse and ggplot2 extensions.

2. Data Preparation

We will do the data preparation in a similar procedure of the original creation:

Installing and loading the required libraries

hide
packages = c('tidyverse', 'knitr', 'ggdist', 'scales', 'grid', 'formattable', 'gridExtra', 'patchwork')

for (p in packages){
  if(!require(p, character.only = T)){
    install.package(p)
  }
  library(p, character.only = T)
}
library(patchwork)

Importing the dataset

hide
# read csv file
participants <- read_csv('data/Participants.csv')

Data Wrangling

hide
# rename 'HighSchoolOrCollege'
participants$educationLevel <- sub('HighSchoolOrCollege', 
                                    'High School or College',
                                    participants$educationLevel)

# rename columns
participants <- participants %>%
  rename('ParticipantID' = 'participantId', 
         'HouseholdSize' = 'householdSize', 
         'HaveKids' = 'haveKids', 
         'Age' = 'age', 
         'EducationLevel' = 'educationLevel', 
         'InterestGroup' = 'interestGroup', 
         'Joviality' = 'joviality')

# Age variable is binned with the following code chunk:
brks <- c(17, 20, 25, 30, 35, 40, 45, 50, 55, 60)
grps <- c('20 & Below', '21-25', '26-30', '31-35', '36-40', '41-45','46-50', '51-55', '56-60')
participants$AgeGroup <- cut(participants$Age, breaks=brks, labels = grps)

3. Critique and New Design

3.1 Distribution of Age

A histogram is used to reveal the distribution of residents’ age.

hide
knitr::include_graphics("C:/KatherineHuan/ISSS608/th_ex/data/pic/th2_or1.PNG")

3.1.1 Critique

Clarity

Aesthetic

3.1.2 New design

hide
library(plotly)

# Bar chart plot
Age <- ggplot(data= participants, 
       aes(x= AgeGroup)) +
geom_bar(fill= 'light blue') +
ylim(0, 150) +

# annotation
geom_text(stat = 'count',
          aes(label= paste0(round(stat(count)/sum(stat(count))*100, 
          1), '%')), vjust= -0.5, size= 2.5) +

labs(y= 'No. of\nResidents', x= 'AgeGroup',
     title = "Distribution of Residents' Age",
     subtitle= 'Demographics in Engagement, Ohio',
     caption = "Source: VAST Challenge 2022") +
theme(plot.title = element_text(hjust = 0.5),
      plot.subtitle = element_text(hjust = 0.5),
      axis.title.y= element_text(hjust = 0.5), 
      axis.ticks.x= element_blank(),
      panel.background= element_blank(),
      panel.grid.major = element_line(size= 0.2, color = "grey"),
      axis.line= element_line(color= 'grey'),
      plot.caption = element_text(hjust = 0))
ggplotly(Age)

Clarity

Aesthetic

3.2 Joviality Distribution in Relation to AgeGroup and EducationLevel

Line chart is used to reveal the distribution of residents’ joviality according to their age and education level.

hide
knitr::include_graphics("C:/KatherineHuan/ISSS608/th_ex/data/pic/th2_or2.PNG")

3.2.1 Critique

Clarity

Aesthetic

3.2.2 New design

a. Joviality in relation to Age and Education Level

hide
# Plot Joviality in relation to Age and Education Level
p1 <- ggplot(participants,
       aes(x= AgeGroup, y= Joviality)) +

geom_violin(fill= '#66cdaa',
            scale = 'count',alpha = .9, trim = FALSE,
            color= NA) +
geom_boxplot(width= 0.2,
             color = '#065535',
             alpha= 0.8) +
stat_summary(aes(color= 'Mean'),
             fun= 'mean',
             size= 0.05) + 
geom_hline(aes(yintercept = 0.5),
           color= 'black',
           linetype= 'dashed',
           size= .6) +
ylim(-0.3,1.5) +
# Annotation
scale_color_manual(name= 'Statistics',
                   values = (Mean= '#f6546a')) +
# Add on Education Level Factor
facet_grid(~factor(EducationLevel)) +
labs(title= 'Joviality Distribution in Relation to Age',
     x= 'Age Group', y= 'Joviality',
     subtitle= 'Demographics in Engagement, Ohio',
     caption = "Source: VAST Challenge 2022") +
theme(panel.background = element_blank(),
      plot.title = element_text(hjust = 0.5),
      plot.subtitle = element_text(hjust = 0.5),
      axis.title.y = element_text(angle=0,vjust = 0.5),
      axis.ticks.x = element_blank(),
      axis.line= element_line(color= 'grey'),
      axis.text.x = element_text(size=8,angle=90),
      panel.grid.major.y = element_line(color= 'grey', size = 0.1),
      plot.caption = element_text(hjust=0),
      legend.key = element_rect(fill= NA))
p1

To detect the relationship between every two variables, we will plot another 3 charts for better understand how is the incluence in between.

b. Joviality in relation to Age

hide
# Plot Joviality in relation to Age
p2 <- ggplot(participants,
       aes(x= AgeGroup, y= Joviality)) +

geom_violin(fill= '#66cdaa',
            scale = 'count',alpha = .9, trim = FALSE,
            color= NA) +
geom_boxplot(width= 0.2,
             color = '#065535',
             alpha= 0.8) +
stat_summary(aes(color= 'Mean'),
             fun= 'mean',
             size= 0.2) + 
geom_hline(aes(yintercept = 0.5),
           color= 'black',
           linetype= 'dashed',
           size= .6) +

# Annotation
scale_color_manual(name= 'Statistics',
                   values = (Mean= '#f6546a')) +
labs(title= 'Joviality Distribution in Relation to Age',
     x= 'Age Group', y= 'Joviality',
     subtitle= 'Demographics in Engagement, Ohio',
     caption = "Source: VAST Challenge 2022") +
theme(panel.background = element_blank(),
      plot.title = element_text(hjust = 0.5),
      plot.subtitle = element_text(hjust = 0.5),
      axis.title.y = element_text(angle=0,vjust = 0.5),
      axis.ticks.x = element_blank(),
      axis.line= element_line(color= 'grey'),
      axis.text.x = element_text(size=8,angle=0),
      panel.grid.major.y = element_line(color= 'grey', size = 0.1),
      plot.caption = element_text(hjust=0),
      legend.key = element_rect(fill= NA))
p2

c. Joviality in relation to Education Level

hide
# Plot Joviality in relation to Education Level
p3 <-  ggplot(participants,
       aes(x= EducationLevel, y= Joviality)) +

geom_violin(fill= '#66cdaa',
            scale = 'count',alpha = .9, trim = FALSE,
            color= NA) +
geom_boxplot(width= 0.08,
             color = '#065535',
             alpha= 0.8) +
stat_summary(aes(color= 'Mean'),
             fun= 'mean',
             size= 0.2) + 
geom_hline(aes(yintercept = 0.5),
           color= 'black',
           linetype= 'dashed',
           size= .6) +

# Annotation
scale_color_manual(name= 'Statistics',
                   values = (Mean= '#f6546a')) +
# Add on Education Level Factor
#facet_grid(~factor(AgeGroup)) +
labs(title= 'Joviality Distribution in Relation to Education Level',
     x= 'Education Level', y= 'Joviality',
     subtitle= 'Demographics in Engagement, Ohio',
     caption = "Source: VAST Challenge 2022") +
theme(panel.background = element_blank(),
      plot.title = element_text(hjust = 0.5),
      plot.subtitle = element_text(hjust = 0.5),
      axis.title.y = element_text(angle=0,vjust = 0.5),
      axis.ticks.x = element_blank(),
      axis.line= element_line(color= 'grey'),
      axis.text.x = element_text(size=8,angle=0),
      panel.grid.major.y = element_line(color= 'grey', size = 0.1),
      plot.caption = element_text(hjust=0),
      legend.key = element_rect(fill= NA))
p3

d. Education Level in relation to Age

hide
# Data manipulation
percent <-
  participants %>% 
  group_by(AgeGroup, EducationLevel) %>%
  summarise(edu_size= n()) %>%
  mutate(edu_pct= percent(edu_size/sum(edu_size))) 

#Plot Education Level in relation to Age
p4 <- ggplot(data= percent, 
       aes(x= factor(AgeGroup), 
           y= edu_pct,
           group= EducationLevel,
           color= factor(EducationLevel))) +
geom_line() +
scale_color_discrete(name= 'Education Level') +
scale_y_continuous(labels = percent_format(),
                   expand = c(0.2, 0.2)) +
labs(x= 'Age Group',y= 'Propotions',
     title = "Composition of Residents' Education Level in Age Groups")+
theme(legend.position = 'top', legend.direction = 'horizontal',
      axis.title.y= element_text(angle=90), axis.ticks.x= element_blank(),
      panel.background= element_blank(), axis.line= element_line(color= 'grey'),
      panel.grid.major = element_line(size= 0.2, color = "grey"),
      legend.key = element_rect(fill= NA), legend.title = element_text(size = 8.5),
      plot.title = element_text(hjust=0.5))
p4

Clarity

Aesthetic

4. Conclusions